# The output of this cell should be committed to the repo such that the styles are applied after
# opening the notebook

from IPython.core.display import HTML

# Read the RISE stylesheet from disk and inject it into the notebook output
# so the custom styles take effect as soon as the notebook is opened.
with open("rise.css", "r") as stylesheet:
    styles = stylesheet.read()
HTML(f'<style>{styles}</style>')
import numpy as np
import itertools as it
from tqdm import tqdm

import matplotlib
from matplotlib import pyplot as plt
import plotly.express as px
import pandas as pd

import ipywidgets as widgets

from tfl_training_anomaly_detection.exercise_tools import evaluate, get_kdd_data, get_house_prices_data, create_distributions, contamination, \
perform_rkde_experiment, get_mnist_data

from ipywidgets import interact

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import IsolationForest
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity

from tfl_training_anomaly_detection.vae import VAE, build_decoder_mnist, build_encoder_minst, build_contaminated_minst

from tensorflow import keras

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (5, 5)

Snow

Anomaly Detection

Anomaly Detection via Density Estimation

Idea: Estimate the density of $F_0$. Areas of low density are anomalous.

  • Often $p$ is too small to estimate complete mixture model
  • Takes into account that $F_1$ might not be well-defined
  • Estimation procedure needs to be robust against contamination if no clean training data is available

Kernel Density Estimation

  • Non-parametric method
  • Can represent almost arbitrarily shaped densities
  • Each training point "spreads" a fraction of the probability mass as specified by the kernel function

Definition:

  • $K: \mathbb{R} \to \mathbb{R}$ kernel function
    • $K(r) \geq 0$ for all $r\in \mathbb{R}$
    • $\int_{-\infty}^{\infty} K(r) dr = 1$
  • $h > 0$ bandwidth
  • Bandwidth is the most crucial parameter


Definition: Let $D = \{x_1,\ldots,x_N\}\subset \mathbb{R}^p$. The KDE with kernel $K$ and bandwidth $h$ is $KDE_h(x, D) = \frac{1}{N}\sum_{i=1}^N \frac{1}{h^p}K\left(\frac{|x-x_i|}{h}\right)$


Effect of bandwidth and kernel

Exercise

Play with the parameters!

dists = create_distributions(dim=2, dim_irrelevant=0)

sample_train = dists['Double Blob'].sample(500)
X_train = sample_train[-1]
y_train = [0]*len(X_train)

plt.scatter(X_train[:,0], X_train[:,1], c = 'blue', s=10)
plt.show()
2023-03-27 17:59:17.581218: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
# Helper function
def fit_kde(kernel, bandwidth, X_train):
    """Fit and return a ``KernelDensity`` model with the given kernel and bandwidth."""
    estimator = KernelDensity(kernel=kernel, bandwidth=bandwidth)
    # sklearn's fit() returns the estimator itself, so this is the fitted model.
    return estimator.fit(X_train)

def visualize_kde(kde, bandwidth, X_test, y_test):
    """Plot the density contours of a fitted KDE together with labeled test points.

    Parameters
    ----------
    kde : fitted ``sklearn.neighbors.KernelDensity`` model.
    bandwidth : bandwidth value; used only in the plot title.
    X_test : array of shape (n_samples, 2) — only two dimensions are plotted.
    y_test : labels; 0 is drawn blue (nominal), anything else red (anomaly).
    """
    fig, axis = plt.subplots(figsize=(5, 5))

    # Evaluate the density on a regular 50x50 grid over [-10, 10]^2.
    lin = np.linspace(-10, 10, 50)
    grid_points = list(it.product(lin, lin))
    ys, xs = np.meshgrid(lin, lin)
    # The score function of sklearn returns log-densities
    scores = np.exp(kde.score_samples(grid_points)).reshape(50, 50)
    colormesh = axis.contourf(xs, ys, scores)
    fig.colorbar(colormesh)
    # Fix: title typo 'Conturs' -> 'Contours'.
    axis.set_title('Density Contours (Bandwidth={})'.format(bandwidth))
    axis.set_aspect('equal')
    color = ['blue' if i == 0 else 'red' for i in y_test]
    plt.scatter(X_test[:, 0], X_test[:, 1], c=color)
    plt.show()

Choose KDE Parameters

# Module-level holders for the interactively chosen KDE parameters.
ker = None
bdw = None
# Render a dropdown (kernel) and a slider (bandwidth); the decorated function
# is called on every widget change and stores the current selection globally.
@interact(
    kernel=['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine'],
    bandwidth=(.1, 10.)
)
def set_kde_params(kernel, bandwidth):
    # Persist the widget state so the cells below can read it.
    global ker, bdw

    ker = kernel
    bdw = bandwidth
# NOTE(review): the fit below runs once, with the widget values current at
# cell-execution time — moving the widgets afterwards does not re-fit the KDE;
# re-run this cell to pick up new parameter choices.
kde = fit_kde(ker, bdw, X_train)
visualize_kde(kde, bdw, X_train, y_train)

Bandwidth Selection

The bandwidth is the most important parameter of a KDE model. A wrongly adjusted value will lead to over- or under-smoothing of the density curve.

A common method to select a bandwidth is maximum log-likelihood cross validation. $$h_{\textrm{llcv}} = \arg\max_{h}\frac{1}{k}\sum_{i=1}^k\sum_{y\in D_i}\log\left(\frac{k}{N(k-1)}\sum_{x\in D_{-i}}K_h(x, y)\right)$$ where $D_{-i}$ is the data without the $i$th cross validation fold $D_i$.

Exercises

ex no.1: Noisy sinusoidal

# Generate example
dists = create_distributions(dim=2)

distribution_with_anomalies = contamination(
    nominal=dists['Sinusoidal'],
    anomaly=dists['Blob'],
    p=0.05
)

# Train data
sample_train = dists['Sinusoidal'].sample(500)
X_train = sample_train[-1].numpy()

# Test data
sample_test = distribution_with_anomalies.sample(500)
X_test = sample_test[-1].numpy()
y_test = sample_test[0].numpy()

scatter = plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
handels, _ = scatter.legend_elements()
plt.legend(handels, ['Nominal', 'Anomaly'])
plt.gca().set_aspect('equal')
plt.show()
####################################################################################################################
# TODO: Define the search space for the kernel and the bandwidth
####################################################################################################################
param_space = {
    'kernel': ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine'], # Add available kernels
    'bandwidth': np.linspace(0.1, 10, 100), # Define Search space for bandwidth parameter
}

def hyperopt_by_score(X_train, param_space, cv=5):
    """Random search over KDE hyperparameters via cross-validation.

    With ``scoring=None`` the search maximizes the estimator's own score,
    which for ``KernelDensity`` is the total log-likelihood of the held-out
    fold, so no labels are needed.

    Returns
    -------
    tuple : (best parameter dict, refitted best estimator)
    """
    search = RandomizedSearchCV(
        estimator=KernelDensity(),
        param_distributions=param_space,
        n_iter=100,
        cv=cv,
        scoring=None,  # use the KDE's internal log-probability score
    )
    search.fit(X_train)
    return search.best_params_, search.best_estimator_

Run the code below to perform hyperparameter optimization.

params, kde = hyperopt_by_score(X_train, param_space)

print('Best parameters:')
for key in params:
    print('{}: {}'.format(key, params[key]))

test_scores = -kde.score_samples(X_test)
test_scores = np.where(test_scores == np.inf, np.max(test_scores[np.isfinite(test_scores)])+1, test_scores)

curves = evaluate(y_test, test_scores)
/opt/anaconda3/envs/tfl-training-practical-anomaly-detection/lib/python3.9/site-packages/sklearn/model_selection/_search.py:922: UserWarning: One or more of the test scores are non-finite: [-623.53348226 -605.74955605 -455.44032806 -457.57891538 -523.00064761
 -573.31848479 -585.06775315 -547.55102979 -625.52128401 -687.56008524
 -488.42669585 -663.87539486 -490.39423286 -393.44747378 -475.45274077
 -594.06204868 -492.63439368 -452.12986864 -580.0071599  -495.89016144
 -482.25213518 -481.9648991  -400.69400877 -572.81352273 -501.43232294
 -399.16518928 -532.72597178 -530.86299813 -390.69165379 -597.02597836
 -494.37933174 -600.62363246 -570.19929527 -591.79830774 -521.65908745
          -inf -487.22993216 -632.95641925 -636.63224513 -619.98025567
          -inf -499.93765811 -490.80460304          -inf -553.2088415
 -432.27055747 -499.2496648  -613.47519946 -535.17080827 -388.47444886
 -617.81621648 -430.58312094 -509.60990321 -601.47858781 -522.04977739
 -470.3678377  -499.72287393 -556.75491481 -682.7112003  -564.31138217
 -540.84749294 -524.2436265  -427.16075064 -476.28044822 -639.12644427
 -509.81945574 -489.05434537 -684.34035329 -570.92922592 -650.02889209
 -433.44009742 -608.63887271 -515.93846192          -inf -446.2420344
 -592.23940538 -446.26701455 -454.51319895 -583.74184441 -596.56381917
 -662.07371376 -657.10093574 -554.28866062 -458.77576388 -583.83961792
 -628.74283864 -607.02548877 -384.86240138 -528.022417   -606.2795057
 -576.5179506  -501.16740175 -645.10314322 -613.36114226          -inf
 -498.03175464 -538.22058179 -545.88688467 -537.52331246 -419.0494592 ]
  warnings.warn(
/opt/anaconda3/envs/tfl-training-practical-anomaly-detection/lib/python3.9/site-packages/sklearn/model_selection/_search.py:929: RuntimeWarning: invalid value encountered in subtract
  array_stds = np.sqrt(np.average((array -
Best parameters:
kernel: gaussian
bandwidth: 0.4
visualize_kde(kde, params['bandwidth'], X_test, y_test)

Exercise: Isolate anomalies in house prices

You are a company responsible for estimating house prices around Ames, Iowa, specifically around the College area. But there is a problem: houses from a nearby area, 'Veenker', are often included in your dataset. You want to build an anomaly detection algorithm that filters out, one by one, every point that comes from the wrong neighborhood. You have been able to isolate an X_train dataset which, you are sure, contains only houses from the College area. Following the previous example, test your ability to isolate anomalies in new incoming data (X_test) with KDE.

Advanced exercise: What happens if the contamination comes from other areas? You can choose among the following names:

OldTown, Veenker, Edwards, MeadowV, Somerst, NPkVill, BrDale, Gilbert, NridgHt, Sawyer, Blmngtn, Blueste

X_train, X_test, y_test = get_house_prices_data(neighborhood = 'CollgCr', anomaly_neighborhood='Veenker')
X_train
LotArea SalePrice OverallCond
0 9179 193000 5
1 10859 145000 5
2 7000 136500 8
3 11049 179900 5
4 9337 204750 5
... ... ... ...
115 9808 227000 5
116 15523 133500 6
117 10970 147000 6
118 8445 133000 7
119 11250 223500 5

120 rows × 3 columns

# Total data
# Fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True builds the identical combined frame.
train_test_data = pd.concat([X_train, X_test], ignore_index=True)
# 0 = known-clean training houses, followed by the (possibly anomalous) test labels.
y_total = [0] * len(X_train) + y_test

fig = px.scatter_3d(train_test_data, x='LotArea', y='OverallCond', z='SalePrice', color=y_total)

fig.show()

Solution

# When data are highly in-homogeneous, like in this case, it is often beneficial 
# to rescale them before applying any anomaly detection or clustering technique.
scaler = MinMaxScaler()
X_train_rescaled = scaler.fit_transform(X_train)
param_space = {
    'kernel': ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine'], # Add available kernels
    'bandwidth': np.linspace(0.1, 10, 100), # Define Search space for bandwidth parameter
}
params, kde = hyperopt_by_score(X_train_rescaled, param_space)
/opt/anaconda3/envs/tfl-training-practical-anomaly-detection/lib/python3.9/site-packages/sklearn/model_selection/_search.py:922: UserWarning:

One or more of the test scores are non-finite: [-228.2978916  -174.48341773  -62.81981322 -193.75251657 -178.22833121
 -152.37817126 -170.29345463 -163.08547919  -44.88298831  -98.68326942
 -238.97266469  -90.22412772  -88.1156289  -186.22167848  -92.27331121
 -149.811535   -141.52217795 -201.8590992  -175.10906076 -111.97566863
  -89.71690104  -98.86473442 -210.29038631 -235.85339425 -156.35908044
          -inf -190.13714485  -71.36008916    0.57047052 -160.40786397
 -165.82635351  -60.2138246  -215.02187598 -160.94365706 -143.89871932
 -191.5619423  -184.99209944 -240.48304675 -222.78131195          -inf
 -172.14555475 -199.88431989 -117.00615348 -126.6494825  -113.13033752
 -158.50944084 -226.75650067  -72.58312108 -190.8339942           -inf
 -163.30633779 -222.78107598 -132.9492478  -154.45277808 -242.69082617
 -204.24161797 -194.16053785 -213.17794388 -150.25741669 -212.23798475
  -13.32401378          -inf -158.41708204 -167.38859854  -42.13932623
 -129.67202259 -169.05509806 -207.34508814 -105.30636104          -inf
 -175.29386062          -inf -216.81981658  -33.23551373 -124.3965235
 -196.55734493          -inf  -97.41163609 -130.60719234 -171.38526049
 -125.68210987  -39.7558721     7.54781112 -142.66859394 -129.90590961
 -117.70863511 -187.79217126 -175.50471145  -42.36942839 -176.03874805
          -inf  -94.26637872 -110.74445666 -173.01189639  -61.6345759
 -181.34575068   -6.15734288  -71.26063084 -113.69139875 -241.96239567]

/opt/anaconda3/envs/tfl-training-practical-anomaly-detection/lib/python3.9/site-packages/sklearn/model_selection/_search.py:929: RuntimeWarning:

invalid value encountered in subtract

print('Best parameters:')
for key in params:
    print('{}: {}'.format(key, params[key]))

X_test_rescaled = scaler.transform(X_test)
test_scores = -kde.score_samples(X_test_rescaled)
test_scores = np.where(test_scores == np.inf, np.max(test_scores[np.isfinite(test_scores)])+1, test_scores)
curves = evaluate(y_test, test_scores)
Best parameters:
kernel: cosine
bandwidth: 0.6

The Curse of Dimensionality

The flexibility of KDE comes at a price. The dependency on the dimensionality of the data is quite unfavorable.


Theorem [Stone, 1982] Any estimator that is consistent$^*$ with the class of all $k$-fold differentiable pdfs over $\mathbb{R}^d$ has a convergence rate of at most

$$ \frac{1}{n^{\frac{k}{2k+d}}} $$

$^*$Consistency = for all pdfs $p$ in the class: $\lim_{n\to\infty}|KDE_h(x, D) - p(x)|_\infty = 0$ with probability $1$.

Exercise

  • The very slow convergence in high dimensions does not necessarily mean that we will see bad results in high dimensional anomaly detection with KDE.
  • Especially if the anomalies are very outlying.
  • However, in cases where contours of the nominal distribution are non-convex we can run into problems.

We take a look at a higher dimensional version of our previous data set.

dists = create_distributions(dim=3)

distribution_with_anomalies = contamination(
    nominal=dists['Sinusoidal'],
    anomaly=dists['Blob'],
    p=0.01
)

sample = distribution_with_anomalies.sample(500)

y = sample[0]
X = sample[-1]
fig = px.scatter_3d(x=X[:, 0], y=X[:, 1], z=X[:, 2], color=y)
fig.show()
# Fit KDE on high dimensional examples 
rocs = []
auprs = []
bandwidths = []

param_space = {
        'kernel': ['gaussian'],
        'bandwidth': np.linspace(0.1, 100, 1000), # Define Search space for bandwidth parameter
    }

kdes = {}
dims = np.arange(2,16)
for d in tqdm(dims):
    # Generate d dimensional distributions
    dists = create_distributions(dim=d)

    distribution_with_anomalies = contamination(
        nominal=dists['Sinusoidal'],
        anomaly=dists['Blob'],
        p=0
    )

    # Train on clean data
    sample_train = dists['Sinusoidal'].sample(500)
    X_train = sample_train[-1].numpy()
    # Test data
    sample_test = distribution_with_anomalies.sample(500)
    X_test = sample_test[-1].numpy()
    y_test = sample_test[0].numpy()

    # Optimize bandwidth
    params, kde = hyperopt_by_score(X_train, param_space)
    kdes[d] = (params, kde)
    
    bandwidths.append(params['bandwidth'])

    test_scores = -kde.score_samples(X_test)
    test_scores = np.where(test_scores == np.inf, np.max(test_scores[np.isfinite(test_scores)])+1, test_scores)

    
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:35<00:00,  2.56s/it]
# Plot cross section of pdf 
fig, axes = plt.subplots(nrows=2, ncols=7, figsize=(15, 5))
for d, axis in tqdm(list(zip(kdes, axes.flatten()))):
    
    params, kde = kdes[d]

    lin = np.linspace(-10, 10, 50)
    grid_points = list(it.product(*([[0]]*(d-2)), lin, lin))
    ys, xs = np.meshgrid(lin, lin)
    # The score function of sklearn returns log-densities
    scores = np.exp(kde.score_samples(grid_points)).reshape(50, 50)
    colormesh = axis.contourf(xs, ys, scores)
    axis.set_title("Dim = {}".format(d))
    axis.set_aspect('equal')
    

# Plot evaluation
print('Crossection of the KDE at (0,...,0, x, y)')
plt.show()
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:01<00:00,  7.97it/s]
Crossection of the KDE at (0,...,0, x, y)

Robustness

Another drawback of KDE in the context of anomaly detection is that it is not robust against contamination of the data


Definition The breakdown point of an estimator is the smallest fraction of observations that need to be changed so that we can move the estimate arbitrarily far away from the true value.


Example: The sample mean has a breakdown point of $0$. Indeed, for a sample of $x_1,\ldots, x_n$ we only need to change a single value in order to move the sample mean in any way we want. That means that the breakdown point is smaller than $\frac{1}{n}$ for every $n\in\mathbb{N}$.

Robust Statistics

There are robust replacements for the sample mean:

  • Median of means: Split the dataset into $S$ equally sized subsets $X_1,\ldots, X_S$ and compute $\mathrm{median}(\overline{X_1},\ldots, \overline{X_S})$
  • M-estimation: The mean in a normed vector space is the value that minimizes the squared distances
    $\overline{X} = \min_{y}\sum_{x\in X}|x-y|^2$
    M-estimation replaces the quadratic loss with a more robust loss function.

Huber loss

Switch from quadratic to linear loss at prescribed threshold

import numpy as np


def huber(error, threshold):
    """Huber loss: quadratic for |error| <= threshold, linear (slope = threshold) beyond.

    Works element-wise on NumPy arrays as well as on scalars.
    """
    magnitude = np.abs(error)
    in_quadratic_zone = (magnitude <= threshold)
    quadratic_part = in_quadratic_zone * (error**2) / 2
    linear_part = (1 - in_quadratic_zone) * threshold * (magnitude - threshold / 2)
    return quadratic_part + linear_part

x = np.linspace(-5, 5)
y = huber(x, 1)

plt.plot(x, y)
plt.gca().set_title("Huber Loss")
plt.show()

Hampel loss

More complex loss function that depends on three parameters $0 < a < b < r$.

import numpy as np

def single_point_hampel(error, a, b, r):
    """Hampel loss for a single scalar error, with parameters 0 < a < b < r.

    Quadratic up to ``a``, linear between ``a`` and ``b``, smoothly bending
    between ``b`` and ``r``, and constant beyond ``r``.
    """
    magnitude = abs(error)
    if magnitude <= a:
        return error**2 / 2
    if magnitude <= b:
        return 1 / 2 * a**2 + a * (magnitude - a)
    if magnitude <= r:
        return a * (2 * b - a + (magnitude - b) * (1 + (r - magnitude) / (r - b))) / 2
    return a * (b - a + r) / 2

hampel = np.vectorize(single_point_hampel)

x = np.linspace(-10.1, 10.1)
y = hampel(x, a=1.5, b=3.5, r=8)

plt.plot(x, y)
plt.gca().set_title("Hampel Loss")
plt.show()

KDE is a Mean


Kernel as scalar product:

  • Let $K$ be a radial monotonic$^\ast$ kernel over $\mathbb{R}^n$.
  • For $x\in\mathbb{R}^n$ let $\phi_x = K(\cdot, x)$.
  • Vector space over the linear span of $\{\phi_x \mid x\in\mathbb{R}^n\}$:
    • Pointwise addition and scalar multiplication.
  • Define the scalar product $\langle \phi_x, \phi_y\rangle = K(x,y)$.
  • Advantage: Scalar product is computable
  • Call this the reproducing kernel Hilbert space (RKHS) of $K$.
  • $\mathrm{KDE}_h(\cdot, D) = \frac{1}{N}\sum_{i=1}^N K_h(\cdot, x_i) = \frac{1}{N}\sum_{i=1}^N\phi_{x_i}$
    • where $K_h(x,y) = \frac{1}{h}K\left(\frac{|x-y|}{h}\right)$

$^*$All kernels that we have seen are radial and monotonic

Exercise

We compare the performance of different approaches to recover the nominal distribution under contamination. Here, we use code by Humbert et al. to replicate the results in the referenced paper on median-of-mean KDE. More details on rKDE can instead be found in this paper by Kim and Scott.

# =======================================================
#   Parameters
# =======================================================
algos = [
    'kde',
    'mom-kde', # Median-of-Means
    'rkde-huber', # robust KDE with huber loss
    'rkde-hampel', # robust KDE with hampel loss
]

dataset = 'house-prices'
dataset_options = {'neighborhood': 'CollgCr', 'anomaly_neighborhood': 'Edwards'}

outlierprop_range = [0.01, 0.02, 0.03, 0.05, 0.07, 0.1, 0.2, 0.3, 0.4, 0.5]
kernel = 'gaussian'
auc_scores = perform_rkde_experiment(
    algos,
    dataset,
    dataset_options,
    outlierprop_range,
    kernel,
)
Dataset:  house-prices
Outlier prop: 0.01 (1 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 4 iterations
Stop at 2 iterations
Algo:  rkde-hampel
Stop at 4 iterations
Stop at 100 iterations

Outlier prop: 0.02 (2 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 4 iterations
Stop at 2 iterations
Algo:  rkde-hampel
Stop at 4 iterations
Stop at 100 iterations

Outlier prop: 0.03 (3 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 3 iterations
Stop at 2 iterations
Algo:  rkde-hampel
Stop at 3 iterations
Stop at 100 iterations

Outlier prop: 0.05 (4 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 3 iterations
Stop at 2 iterations
Algo:  rkde-hampel
Stop at 3 iterations
Stop at 100 iterations

Outlier prop: 0.07 (5 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 4 iterations
Stop at 3 iterations
Algo:  rkde-hampel
Stop at 4 iterations
Stop at 10 iterations

Outlier prop: 0.1 (6 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 3 iterations
Stop at 2 iterations
Algo:  rkde-hampel
Stop at 3 iterations
Stop at 100 iterations

Outlier prop: 0.2 (7 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 5 iterations
Stop at 2 iterations
Algo:  rkde-hampel
Stop at 5 iterations
Stop at 100 iterations

Outlier prop: 0.3 (8 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 4 iterations
Stop at 2 iterations
Algo:  rkde-hampel
Stop at 4 iterations
Stop at 100 iterations

Outlier prop: 0.4 (9 / 10)
downsample outliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 3 iterations
Stop at 2 iterations
Algo:  rkde-hampel
Stop at 3 iterations
Stop at 100 iterations

Outlier prop: 0.5 (10 / 10)
downsample inliers
Finding best bandwidth...
Algo:  kde
Algo:  mom-kde
Algo:  rkde-huber
Stop at 3 iterations
Stop at 2 iterations
Algo:  rkde-hampel
Stop at 3 iterations
Stop at 100 iterations
fig, ax = plt.subplots(figsize=(7, 5))
# One curve per algorithm: mean anomaly-AUC as a function of contamination level.
for algo, algo_data in auc_scores.groupby('algo'):
    mean_by_prop = algo_data.groupby('outlier_prop').mean()  # computed once, reused below
    ax.plot(mean_by_prop.index, mean_by_prop['auc_anomaly'], 'o-', label=algo)
plt.legend()
plt.xlabel('outlier_prop')
plt.ylabel('auc_score')
plt.title('Comparison of rKDE against contamination')
Text(0.5, 1.0, 'Comparison of rKDE against contamination')

Try using different neighborhoods for contamination. Which robust KDE algorithm performs better overall? Choose among the following options:

OldTown, Veenker, Edwards, MeadowV, Somerst, NPkVill, BrDale, Gilbert, NridgHt, Sawyer, Blmngtn, Blueste

You can also change the kernel type: gaussian, tophat, epanechnikov, exponential, linear, or cosine.

Summary

  • Kernel density estimation is a non-parametric method to estimate a pdf from a sample.
  • Bandwidth is the most important parameter.
  • Converges to the true pdf if $n\to\infty$.
    • Convergence exponentially depends on the dimension.
  • KDE is sensitive to contamination:
    • In a contaminated setting one can employ methods from robust statistics to obtain robust estimates.

Implementations

Anomaly Detection via Isolation

Idea: An anomaly should allow "simple" descriptions that distinguish it from the rest of the data.

  • Descriptions: Conjunction of single attribute tests, i.e. $X_i \leq c$ or $X_i > c$.
  • Example: $X_1 \leq 1.2 \text{ and } X_5 > -3.4 \text{ and } X_7 \leq 5.6$.
  • Complexity of description: Number of conjunctions.

Moreover, we assume that a short random description will have a significantly larger chance of isolating an anomaly than isolating any nominal point.

  • Choose random isolating descriptions and compute anomaly score from average complexity.

Isolation Tree

Isolation Forest (iForest) implements this idea by generating an ensemble of random decision trees. Each tree is built as follows:


Input: Data set (subsample) $X$, maximal height $h$

  • Randomly choose feature $i$ and split value $s$ (in range of data)
  • Recursively build subtrees on $X_L = \{x\in X\mid x_i \leq s\}$ and $X_R = X\setminus X_L$
  • Stop if remaining data set $ \leq 1$ or maximal height reached
  • Store test $x_i\leq s$ for inner nodes and $|X|$ for leaf nodes

Visualization

Isolation Tree as Partition Diagram

Isolation Depth


Input: Observation $x$

  • ${\ell} = $ length of path from root to leaf according to tests
  • ${n} = $ size of remaining data set in leaf node
  • ${c(n)} =$ expected length of a path in a BST with $n$ nodes $={O}(\log n)$
  • ${h(x)} = \ell + c(n)$

Isolation Depth of Outlier (red) and nominal (blue)

Isolation Forest

  • Train $k$ isolation trees on subsamples of size $N$
Isolation depth of nominal point (left) and outlier (right)

Variants of Isolation Forest

Variant: Random Robust Cut Forest

New Rule to Choose Split Test:

  • $\ell_i$: length of the $i$th component of the bounding box around current data set
  • Choose dimension $i$ with probability $\frac{\ell_i}{\sum_j \ell_j}$
  • More robust against "noise dimensions"

Variant: Extended Isolation Forest

New split criterion:

  • Uniformly choose a normal and an orthogonal hyperplane through the data
  • Removes a bias that was empirically observed when plotting the outlier score of iForest on low dimensional data sets

Exercise: Network Security

In the final exercise of today you will have to develop an anomaly detection system for network traffic.

Briefing

A large e-commerce company A is experiencing downtime due to attacks on their infrastructure. You were instructed to develop a system that can detect malicious connections to the infrastructure. It is planned that suspicious clients will be banned.

Another data science team already prepared the connection data of the last year for you. They also separated a test set and manually identified and labeled attacks in that data.

The Data

We will work on a version of the classic KDD99 data set.

Kddcup 99 Data Set


The KDD Cup '99 dataset was created by processing the tcp dump portions of the 1998 DARPA Intrusion Detection System (IDS) Evaluation dataset, created by MIT Lincoln Lab [1]. The artificial data (described on the dataset's homepage <https://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html>_) was generated using a closed network and hand-injected attacks to produce a large number of different types of attack with normal activity in the background.

=========================   ====================================================
Samples total               976158
Dimensionality              41
Features                    string (str), discrete (int), continuous (float)
Targets                     str, 'normal.' or name of the anomaly type
Proportion of Anomalies     1%
=========================   ====================================================


Task

You will have to develop the system on your own. In particular, you will have to

  • Explore the data.
  • Choose an algorithm.
  • Find a good detection threshold.
  • Evaluate and summarize your results.
  • Estimate how much A could save through the use of your system.
X_train,X_test,y_test = get_kdd_data()

Explore Data

#
# Add your exploration code
#
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
# get description
X_train.describe()
0 1 2 3 4 5 6 7 8 9 ... 31 32 33 34 35 36 37 38 39 40
count 80524 80524 80524 80524 80524 80524 80524 80524 80524 80524 ... 80524 80524 80524.0 80524.0 80524.0 80524.0 80524.0 80524.0 80524.0 80524.0
unique 2028 3 50 10 3028 9750 2 3 2 19 ... 256 256 101.0 97.0 101.0 53.0 88.0 50.0 101.0 101.0
top 0 b'tcp' b'http' b'SF' 105 0 0 0 0 0 ... 255 255 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
freq 71096 62133 49442 75375 5928 13965 80523 80509 80523 80064 ... 34715 45956 52132.0 51497.0 27037.0 40649.0 76009.0 75569.0 72971.0 73038.0

4 rows × 41 columns

# get better description
X_train.drop(columns=[1,2,3]).astype(float).describe()
0 4 5 6 7 8 9 10 11 12 ... 31 32 33 34 35 36 37 38 39 40
count 80524.000000 8.052400e+04 8.052400e+04 80524.000000 80524.000000 80524.000000 80524.000000 80524.000000 80524.000000 80524.000000 ... 80524.000000 80524.000000 80524.000000 80524.000000 80524.000000 80524.000000 80524.000000 80524.000000 80524.000000 80524.000000
mean 213.101038 1.259843e+03 3.334803e+03 0.000012 0.000484 0.000037 0.045514 0.000224 0.694687 0.034561 ... 152.240438 201.341290 0.840585 0.056097 0.154358 0.023282 0.009658 0.008620 0.056944 0.055145
std 1342.005913 4.197120e+04 3.920496e+04 0.003524 0.037125 0.010572 0.871144 0.022837 0.460543 4.448219 ... 103.498895 88.014214 0.311452 0.179133 0.307652 0.048968 0.091206 0.087943 0.223654 0.217902
min 0.000000 0.000000e+00 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 1.460000e+02 1.050000e+02 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 40.000000 169.000000 0.910000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 2.320000e+02 3.920000e+02 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 ... 175.000000 255.000000 1.000000 0.000000 0.010000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 0.000000 3.170000e+02 2.012250e+03 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 ... 255.000000 255.000000 1.000000 0.010000 0.090000 0.030000 0.000000 0.000000 0.000000 0.000000
max 36570.000000 5.133876e+06 5.134218e+06 1.000000 3.000000 3.000000 30.000000 4.000000 1.000000 884.000000 ... 255.000000 255.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 38 columns

# Check for NaNs
print("Number of NaNs: {}".format(X_train.isna().sum().sum()))
Number of NaNs: 0
#
# Add your preparation code here
#

# Encode string features: columns 1-3 hold byte-string categoricals
# (see the describe() output above, e.g. b'tcp', b'http', b'SF').
binarizer = LabelBinarizer()
one_hots = None
one_hots_test = None
for i in [1, 2, 3]:
    # Fit on the training column only, so train and test get identical layouts.
    binarizer.fit(X_train[[i]].astype(str))
    if one_hots is None:
        one_hots = binarizer.transform(X_train[[i]].astype(str))
        one_hots_test = binarizer.transform(X_test[[i]].astype(str))
    else:
        # Append this column's one-hot block to the already-encoded ones.
        one_hots = np.concatenate([one_hots, binarizer.transform(X_train[[i]].astype(str))], axis=1)
        one_hots_test = np.concatenate([one_hots_test, binarizer.transform(X_test[[i]].astype(str))], axis=1)

# Replace the raw string columns with their one-hot encodings.
X_train.drop(columns=[1,2,3], inplace=True)
X_train_onehot = pd.DataFrame(np.concatenate([X_train.values, one_hots], axis=1))

X_test.drop(columns=[1,2,3], inplace=True)
X_test_onehot = pd.DataFrame(np.concatenate([X_test.values, one_hots_test], axis=1))
# Encode y: every label other than b'normal.' counts as an attack (1).
y_test_bin = np.where(y_test == b'normal.', 0, 1)
# Remove suspicious data
# This step is not strictly necessary but can improve performance
suspicious = X_train_onehot.apply(lambda col: (col - col.mean()).abs() > 4 * col.std() if col.std() > 1 else False)
suspicious = suspicious.any(axis=1)# 4 sigma rule
print('filtering {} suspicious data points'.format(suspicious.sum()))
X_train_clean = X_train_onehot[~suspicious]
filtering 2949 suspicious data points

Summary

  • Isolation Forest empirically shows very good performance up to relatively high dimensions
  • It is relatively robust against contamination
  • Usually little need for hyperparameter tuning

Implementations

Choose Algorithm

# TODO: implement proper model selection
iforest = IsolationForest()
iforest.fit(X_train_clean)
IsolationForest()
# find best threshold
X_test_onehot, X_val_onehot, y_test_bin, y_val_bin = train_test_split(X_test_onehot, y_test_bin, test_size=.5)
y_score = -iforest.score_samples(X_val_onehot).reshape(-1)

Evaluate Solution

#
# Insert evaluation code
#

# calculate scores if any anomaly is present
if np.any(y_val_bin == 1):
    # Fix: 'eval' shadowed the builtin — renamed to eval_result.
    eval_result = evaluate(y_val_bin, y_score)
    prec, rec, thr = eval_result['PR']
    # F1 at every PR operating point; guard the 0/0 case (prec = rec = 0),
    # which previously produced a NaN that corrupts argmax.
    with np.errstate(divide='ignore', invalid='ignore'):
        f1s = np.nan_to_num(2 * (prec * rec) / (prec + rec))
    # Fix: a PR curve has one more (precision, recall) point than thresholds,
    # so drop the final point to guarantee the argmax indexes into thr.
    threshold = thr[np.argmax(f1s[:-1])]

    # Score the held-out test set and binarize at the chosen threshold.
    y_score = -iforest.score_samples(X_test_onehot).reshape(-1)
    y_pred = np.where(y_score < threshold, 0, 1)

    print('Precision: {}'.format(metrics.precision_score(y_test_bin, y_pred)))
    print('Recall: {}'.format(metrics.recall_score(y_test_bin, y_pred)))
    print('F1: {}'.format(metrics.f1_score(y_test_bin, y_pred)))
Precision: 0.5292153589315526
Recall: 0.9814241486068112
F1: 0.6876355748373103

Anomaly Detection via Reconstruction Error

Idea: Embed the data into low dimensional space and reconstruct it again. Good embedding of nominal data $\Rightarrow$ high reconstruction error indicates anomaly.

Autoencoder:

  • Parametric family of encoders: $f_\phi: \mathbb{R}^d \to \mathbb{R}^{\text{low}}$
  • Parametric family of decoders: $g_\theta: \mathbb{R}^{\text{low}} \to \mathbb{R}^{d}$
  • Reconstruction error of $(f_\phi, g_\theta)$ on $x$: $|x - g_\theta(f_\phi(x))|$
  • Given data set $D$, find $\phi,\theta$ that minimize $\sum_{x\in D} L(|x- g_\theta(f_\phi(x))|) $ for some loss function $L$.

Visualization

Autoencoder Schema

Neural Networks

Neural networks are very well suited for finding low dimensional representations of data. Hence they are a popular choice for the encoder and the decoder.


Artificial Neuron with $N$ inputs: $y = \sigma\left(\sum_i^N w_i X_i + b\right)$

  • $\sigma$: nonlinear activation-function (applied component wise).
  • $b$ bias

Isolation depth of nominal point and anomaly

Neural Networks

Neural networks combine many artificial neurons into a complex network. These networks are usually organized in layers where the result of each layer is the input for the next layer. Some commonly used layers are:

Variational Autoencoders

An important extension of autoencoders that relates the idea to density estimation. More precisely, we define a generative model for our data using latent variables and combine the maximum likelihood estimation of the parameters with a simultaneous posterior estimation of the latents through amortized stochastic variational inference. We use a decoder network to transform the latent variables into the data distribution, and an encoder network to compute the posterior distribution of the latents given the data.


Definition: The model uses an observed variable $X$ (the data) and a latent variable $Z$ (the defining features of $X$). We assume both $P(Z)$ and $P(X\mid Z)$ to be normally distributed. More precisely

  • $P(Z) = \mathcal{N}(0, I)$
  • $P(X\mid Z) = \mathcal{N}(\mu_\phi(Z), I)$

where $\mu_\phi$ is a neural network parametrized with $\phi$. We use variational inference to perform posterior inference on $Z$ given $X$. We assume that the distribution $P(Z\mid X)$ to be relatively well approximated by a Gaussian and use the posterior approximation:

  • $q(Z\mid X) = \mathcal{N}(\mu_\psi(X), \sigma_\psi(X))$

$\mu_\psi$ and $\sigma_\psi$ are neural networks parameterized with $\psi$


Given a data set $D$ we minimize the (amortized) Kullback-Leibler divergence between our posterior approximation and the true posterior: \begin{align*} D_{KL}(q(z\mid x),p(z\mid x)) &= E_{x\sim X, z\sim q(Z\mid x)}\left[\log\left(\frac{q(z \mid x)}{p(z \mid x)}\right)\right] \\ &= E_{x\sim X, z\sim q(Z\mid x)}\left[\log\left(\frac{q(z \mid x)}{\frac{p(x \mid z)p(z)}{p(x)}}\right)\right] \\ &= E_{x\sim X, z\sim q(Z\mid x)}\left[\log\left(\frac{q(z \mid x)}{p(x \mid z)p(z)}\right) + \log(p(x))\right] \\ &= E_{x\sim X, z\sim q(Z\mid x)}\left[\log\left(\frac{q(z \mid x)}{p(x \mid z)p(z)}\right)\right] + E_{x\sim X}[\log(p(x))]\\ \end{align*}

Now we can define

\begin{align*} \mathrm{ELBO}(q(z\mid x),p(z\mid x)) &:= E_{x\sim X}[\log(p(x))] - D_{KL}(q(z\mid x),p(z\mid x)) \\ &= -E_{x\sim X, z\sim q(Z\mid x)}\left[\log\left(\frac{q(z \mid x)}{p(x \mid z)p(z)}\right)\right] \end{align*}

Note that we can evaluate the expression inside the expectation of the final RHS of the equation and we can obtain unbiased estimates of the expectation via sampling. Let us further try to understand the ELBO as an optimization objective. On one hand, maximizing the ELBO with respect to the parameters in $q$ is equivalent to minimizing the KL divergence between $p$ and $q$. On the other hand, maximizing the ELBO with respect to the parameters in $p$ can be understood as raising a lower bound for the likelihood of the generative model $p(x)$. Hence, the optimization tries to find an encoder and a decoder pair such that it simultaneously provides a good generative explanation of the data and a good approximation of the posterior distribution of the latent variables.

Exercise

The MNIST Data Set

MNIST is one of the most iconic data sets in the history of machine learning. It contains 70000 samples of $28\times 28$ grayscale images of handwritten digits. Because of its moderate complexity and good visualizability it is well suited to study the behavior of machine learning algorithms in higher dimensional spaces.

While originally created for classification (optical character recognition), we can build an anomaly detection data set by corrupting some of the images.

Pre-processing

We first need to obtain the MNIST data set and prepare an anomaly detection set from it. Note that the data set is in row vector format. Therefore, we work with $28\times 28 = 784$ dimensional data points.

Load MNIST Data Set

# Fetch MNIST: `data` holds the flattened images, `target` the digit labels.
mnist = get_mnist_data()

data, target = mnist['data'], mnist['target'].astype(int)
print('data.shape: {}'.format(data.shape))
data.shape: (70000, 784)

Build contaminated Data Sets

We prepared a function that does the job for us. It corrupts a prescribed portion of the data by introducing a rotation, noise or a blackout of some part of the image.

First, we need to transform the data into image format.

# Reshape the 784-dim row vectors into 28x28x1 images and scale to [0, 1].
X = data.reshape((-1, 28, 28, 1)) / 255

Train/Test-Split

We will only corrupt the test set, hence we will perform the train-test split beforehand. We separate a relatively small test set so that we can use as much as possible from the data to obtain high quality representations.

# Hold out 10% for testing; only the test set is corrupted afterwards.
test_size = .1
X_train, X_test, target_train, target_test = train_test_split(X, target, test_size=test_size)
X_test, y_test = build_contaminated_minst(X_test)

# Visualize a random sample of the injected anomalies on a 5x5 grid.
anomalies = X_test[y_test != 0]
selection = np.random.choice(len(anomalies), 25)

fig, axes = plt.subplots(nrows=5, ncols=5, figsize=(5, 5))
for anomaly_img, anomaly_ax in zip(anomalies[selection], axes.flatten()):
    anomaly_ax.imshow(anomaly_img, 'gray')
    anomaly_ax.axis('off')
plt.show()

Autoencoder

Let us finally train an autoencoder model. We replicate the model given in the Keras documentation and apply it in a synthetic outlier detection scenario based on MNIST.

In the `vae` package, we provide the implementation of the VAE. Please take a look into the source code to see how the minimization of the KL divergence is implemented.

Create Model

# Three latent dimensions keep the latent space directly visualizable in 3D.
latent_dim = 3
vae = VAE(
    encoder=build_encoder_minst(latent_dim=latent_dim),
    decoder=build_decoder_mnist(latent_dim=latent_dim),
)
## Inspect model architecture
vae.encoder.summary()
Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_2 (InputLayer)            [(None, 28, 28, 1)]  0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 14, 14, 32)   320         input_2[0][0]                    
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 7, 7, 64)     18496       conv2d[0][0]                     
__________________________________________________________________________________________________
flatten (Flatten)               (None, 3136)         0           conv2d_1[0][0]                   
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 16)           50192       flatten[0][0]                    
__________________________________________________________________________________________________
z_mean (Dense)                  (None, 3)            51          dense_1[0][0]                    
__________________________________________________________________________________________________
z_log_var (Dense)               (None, 3)            51          dense_1[0][0]                    
__________________________________________________________________________________________________
sampling (Sampling)             (None, 3)            0           z_mean[0][0]                     
                                                                 z_log_var[0][0]                  
==================================================================================================
Total params: 69,110
Trainable params: 69,110
Non-trainable params: 0
__________________________________________________________________________________________________
## Inspect model architecture
# Print the decoder's layer stack (latent vector back to a 28x28x1 image).
vae.decoder.summary()
Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_1 (InputLayer)         [(None, 3)]               0         
_________________________________________________________________
dense (Dense)                (None, 3136)              12544     
_________________________________________________________________
reshape (Reshape)            (None, 7, 7, 64)          0         
_________________________________________________________________
conv2d_transpose (Conv2DTran (None, 14, 14, 64)        36928     
_________________________________________________________________
conv2d_transpose_1 (Conv2DTr (None, 28, 28, 32)        18464     
_________________________________________________________________
conv2d_transpose_2 (Conv2DTr (None, 28, 28, 1)         289       
=================================================================
Total params: 68,225
Trainable params: 68,225
Non-trainable params: 0
_________________________________________________________________
# train model
n_epochs = 30

# The VAE computes its own combined (reconstruction + KL) loss internally,
# so compile only needs an optimizer.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
vae.compile(optimizer=optimizer)
history = vae.fit(X_train, epochs=n_epochs, batch_size=128)
2023-03-27 18:01:18.104746: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
Epoch 1/30
493/493 [==============================] - 37s 74ms/step - loss: 43.2125 - reconstruction_loss: 35.0796 - kl_loss: 0.5526
Epoch 2/30
493/493 [==============================] - 37s 74ms/step - loss: 32.0381 - reconstruction_loss: 30.6512 - kl_loss: 1.1567
Epoch 3/30
493/493 [==============================] - 37s 76ms/step - loss: 31.5116 - reconstruction_loss: 30.1268 - kl_loss: 1.3270
Epoch 4/30
493/493 [==============================] - 37s 75ms/step - loss: 31.4865 - reconstruction_loss: 29.9048 - kl_loss: 1.4239
Epoch 5/30
493/493 [==============================] - 37s 75ms/step - loss: 31.3181 - reconstruction_loss: 29.7411 - kl_loss: 1.4881
Epoch 6/30
493/493 [==============================] - 41s 83ms/step - loss: 31.2195 - reconstruction_loss: 29.5245 - kl_loss: 1.5883
Epoch 7/30
493/493 [==============================] - 40s 82ms/step - loss: 31.0206 - reconstruction_loss: 29.2586 - kl_loss: 1.6984
Epoch 8/30
493/493 [==============================] - 38s 78ms/step - loss: 30.9019 - reconstruction_loss: 29.0013 - kl_loss: 1.8064
Epoch 9/30
493/493 [==============================] - 38s 77ms/step - loss: 30.7257 - reconstruction_loss: 28.7885 - kl_loss: 1.8978
Epoch 10/30
493/493 [==============================] - 42s 86ms/step - loss: 30.6760 - reconstruction_loss: 28.5764 - kl_loss: 1.9740
Epoch 11/30
493/493 [==============================] - 45s 92ms/step - loss: 30.5168 - reconstruction_loss: 28.3831 - kl_loss: 2.0583
Epoch 12/30
493/493 [==============================] - 40s 81ms/step - loss: 30.3570 - reconstruction_loss: 27.8616 - kl_loss: 2.3245
Epoch 13/30
493/493 [==============================] - 38s 77ms/step - loss: 29.8790 - reconstruction_loss: 26.8904 - kl_loss: 2.8025
Epoch 14/30
493/493 [==============================] - 38s 78ms/step - loss: 29.5295 - reconstruction_loss: 26.4945 - kl_loss: 2.9482
Epoch 15/30
493/493 [==============================] - 38s 76ms/step - loss: 29.4556 - reconstruction_loss: 26.2875 - kl_loss: 3.0372
Epoch 16/30
493/493 [==============================] - 36s 72ms/step - loss: 29.3284 - reconstruction_loss: 26.1456 - kl_loss: 3.1053
Epoch 17/30
493/493 [==============================] - 35s 72ms/step - loss: 29.2414 - reconstruction_loss: 26.0303 - kl_loss: 3.1343
Epoch 18/30
493/493 [==============================] - 39s 80ms/step - loss: 29.1804 - reconstruction_loss: 25.9409 - kl_loss: 3.1589
Epoch 19/30
493/493 [==============================] - 36s 73ms/step - loss: 29.1137 - reconstruction_loss: 25.8674 - kl_loss: 3.2011
Epoch 20/30
493/493 [==============================] - 35s 71ms/step - loss: 29.0784 - reconstruction_loss: 25.8001 - kl_loss: 3.2285
Epoch 21/30
493/493 [==============================] - 35s 71ms/step - loss: 29.0486 - reconstruction_loss: 25.7555 - kl_loss: 3.2475
Epoch 22/30
493/493 [==============================] - 35s 70ms/step - loss: 29.0516 - reconstruction_loss: 25.6895 - kl_loss: 3.2780
Epoch 23/30
493/493 [==============================] - 37s 76ms/step - loss: 28.9762 - reconstruction_loss: 25.6416 - kl_loss: 3.2959
Epoch 24/30
493/493 [==============================] - 35s 71ms/step - loss: 28.9372 - reconstruction_loss: 25.6033 - kl_loss: 3.3167
Epoch 25/30
493/493 [==============================] - 35s 71ms/step - loss: 28.8955 - reconstruction_loss: 25.5595 - kl_loss: 3.3314
Epoch 26/30
493/493 [==============================] - 36s 73ms/step - loss: 28.8910 - reconstruction_loss: 25.5122 - kl_loss: 3.3445
Epoch 27/30
493/493 [==============================] - 35s 70ms/step - loss: 28.8985 - reconstruction_loss: 25.4727 - kl_loss: 3.3673
Epoch 28/30
493/493 [==============================] - 36s 74ms/step - loss: 28.8594 - reconstruction_loss: 25.4325 - kl_loss: 3.3911
Epoch 29/30
493/493 [==============================] - 35s 71ms/step - loss: 28.8624 - reconstruction_loss: 25.3945 - kl_loss: 3.4032
Epoch 30/30
493/493 [==============================] - 36s 72ms/step - loss: 28.8374 - reconstruction_loss: 25.3717 - kl_loss: 3.4053

Inspect Result

import matplotlib.pyplot as plt


def plot_latent_space(vae, n=10, figsize=10):
    """Plot 2D slices through the decoder's output over the 3D latent space.

    For each of three axis permutations, one latent coordinate is fixed at 0
    while the other two sweep a regular grid over [-scale, scale]; the
    decoded digits are tiled into one image per slice.

    :param vae: trained VAE whose ``decoder`` maps latent vectors to images
    :param n: number of grid steps (decoded digits) per axis
    :param figsize: matplotlib figure size of each slice
    """
    digit_size = 28
    scale = 1.0
    for perm in [[0, 1, 2], [1, 2, 0], [2, 1, 0]]:
        # display a n*n 2D manifold of digits
        figure = np.zeros((digit_size * n, digit_size * n))
        # linearly spaced coordinates of the two swept latent directions
        grid_x = np.linspace(-scale, scale, n)
        grid_y = np.linspace(-scale, scale, n)[::-1]

        for i, yi in enumerate(grid_y):
            for j, xi in enumerate(grid_x):
                # Build [xi, yi, 0] and permute: decoder-input component k
                # receives the old component perm[k].
                z_sample = np.array([[xi, yi, 0]])
                z_sample[0] = z_sample[0][perm]
                x_decoded = vae.decoder.predict(z_sample)
                digit = x_decoded[0].reshape(digit_size, digit_size)
                figure[
                    i * digit_size : (i + 1) * digit_size,
                    j * digit_size : (j + 1) * digit_size,
                ] = digit

        plt.figure(figsize=(figsize, figsize))
        start_range = digit_size // 2
        end_range = n * digit_size + start_range
        pixel_range = np.arange(start_range, end_range, digit_size)
        sample_range_x = np.round(grid_x, 1)
        sample_range_y = np.round(grid_y, 1)
        plt.xticks(pixel_range, sample_range_x)
        plt.yticks(pixel_range, sample_range_y)
        # Since new[k] = old[perm[k]], xi drives latent dim perm.index(0),
        # yi drives perm.index(1) and dim perm.index(2) is held at 0.
        # (Using perm[k] directly, as before, mislabels the non-involution
        # permutation [1, 2, 0].)
        plt.xlabel("z[{}]".format(perm.index(0)))
        plt.ylabel("z[{}]".format(perm.index(1)))
        plt.gca().set_title('z[{}] = 0'.format(perm.index(2)))
        plt.imshow(figure, cmap="Greys_r")
        plt.show()
plot_latent_space(vae)
# Fit a PCA on the training-set latents; index 2 presumably selects the
# sampled latent z (after z_mean and z_log_var) — matches the encoder
# summary's output order.
latents = vae.encoder.predict(X_train)[2]
pca = PCA().fit(latents)

# One slider per latent dimension, each ranging over [-1, 1] in PCA space.
kwargs = {'x_{}'.format(i): (-1., 1.) for i in range(latent_dim)}


@widgets.interact(**kwargs)
def explore_latent_space(**kwargs):
    """Decode and display the image at an interactively chosen latent point."""
    # Slider values are offsets from the PCA image of the latent origin.
    center_img = pca.transform(np.zeros([1, latent_dim]))

    latent_rep_pca = center_img + np.array([[kwargs[key] for key in kwargs]])
    latent_rep = pca.inverse_transform(latent_rep_pca)
    img = vae.decoder(latent_rep).numpy().reshape(28, 28)

    fig, ax = plt.subplots()
    # (The original called ax.axis('off') twice; once suffices.)
    ax.axis('off')

    ax.imshow(img, cmap='gray', vmin=0, vmax=1)
    plt.show()
# 3D scatter of the training latents, colored by digit label.
latents = vae.encoder.predict(X_train)[2]
scatter = px.scatter_3d(
    x=latents[:, 0],
    y=latents[:, 1],
    z=latents[:, 2],
    color=target_train,
)
scatter.show()

# Same view for the contaminated test set, colored by anomaly type.
latents = vae.encoder.predict(X_test)[2]
scatter = px.scatter_3d(
    x=latents[:, 0],
    y=latents[:, 1],
    z=latents[:, 2],
    color=y_test,
)
scatter.show()
# Split the contaminated test data into a final test part and a validation
# part used below for threshold selection (default split ratio).
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test)
n_samples = 10

# Show random validation images next to their reconstructions, each
# annotated with its squared reconstruction error E.
chosen = np.random.choice(range(len(X_val)), n_samples)
samples = X_val[chosen]

fig, axes = plt.subplots(nrows=2, ncols=n_samples, figsize=(10, 2))
for img, ax_col in zip(samples, axes.T):
    encoded = vae.encoder.predict(img.reshape(1, 28, 28, 1))[2]
    decoded = vae.decoder.predict(encoded).reshape(28, 28)
    residual = decoded - img.reshape(28, 28)
    error = (residual * residual).sum()
    ax_col[0].axis('off')
    ax_col[1].axis('off')
    ax_col[0].imshow(img, cmap='gray', vmin=0, vmax=1)
    ax_col[1].imshow(decoded, cmap='gray', vmin=0, vmax=1)
    ax_col[1].set_title('E={:.1f}'.format(error))

plt.tight_layout()
plt.show()
from sklearn import metrics

# Binarize the anomaly-type labels: 0 stays nominal, every other type -> 1.
y_test_bin = y_test.copy()
y_test_bin[y_test_bin != 0] = 1
y_val_bin = y_val.copy()
y_val_bin[y_val_bin != 0] = 1
# Evaluate
# Anomaly score = squared L2 reconstruction error per validation image.
reconstruction = vae.decoder.predict(vae.encoder(X_val)[2])
rerrors = (reconstruction - X_val).reshape(-1, 28*28)
rerrors = (rerrors * rerrors).sum(axis=1)

# Let's calculate scores if any anomaly is present
if np.any(y_val_bin == 1):
    # NOTE: renamed from `eval`, which shadowed the builtin.
    # Select the threshold maximizing F1 on the validation set; precision
    # and recall have one more entry than thresholds, hence the [:-1].
    pr_eval = evaluate(y_val_bin.astype(int), rerrors.astype(float))
    pr, rec, thr = pr_eval['PR']
    with np.errstate(divide='ignore', invalid='ignore'):
        f1s = 2 * (pr * rec)[:-1] / (pr + rec)[:-1]
    # 0/0 points yield NaN; nanargmax skips them instead of returning them.
    threshold = thr[np.nanargmax(f1s)]
    print('Optimal threshold: {}'.format(threshold))

    # Apply the selected threshold to the untouched test part.
    reconstruction = vae.decoder.predict(vae.encoder(X_test)[2])
    reconstruction_error = (reconstruction - X_test).reshape(-1, 28*28)
    reconstruction_error = (reconstruction_error * reconstruction_error).sum(axis=1)

    classification = (reconstruction_error > threshold).astype(int)

    print('Precision: {}'.format(metrics.precision_score(y_test_bin, classification)))
    print('Recall: {}'.format(metrics.recall_score(y_test_bin, classification)))
    print('F1: {}'.format(metrics.f1_score(y_test_bin, classification)))

    metrics.confusion_matrix(y_test_bin, classification)
else:
    reconstruction_error = None
Optimal threshold: 90.2568802068335
Precision: 0.8059701492537313
Recall: 0.4778761061946903
F1: 0.6000000000000001

Sort Data by Reconstruction Error

# Order test images by ascending reconstruction error (worst offenders last).
if reconstruction_error is not None:
    combined = sorted(zip(X_test, reconstruction_error), key=lambda pair: pair[1])

Show Top Autoencoder Outliers

# Display the 100 images with the highest reconstruction error on a grid.
if reconstruction_error is not None:
    n_rows, n_cols = 10, 10
    n_samples = n_rows * n_cols

    # `combined` is sorted ascending, so the tail holds the top outliers.
    top_outliers = [pair[0] for pair in combined[-n_samples:]]

    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(2 * n_cols, 2 * n_rows))
    for outlier, ax in zip(top_outliers, axes.reshape(-1)):
        ax.axis('off')
        ax.imshow(outlier.reshape((28, 28)), cmap='gray', vmin=0, vmax=1)

    plt.show()

Summary

  • Autoencoders are the most prominent reconstruction error based anomaly detection method.
  • Can provide high quality results on high dimensional data.
  • Architecture is highly adaptable to the data (fully connected, CNN, attention,...).
  • Sensitive to contamination.
  • Variational autoencoders are an important variant that improves the interpretability of the latent space.

Implementations

Snow